In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [2234]:
# Dropdown to select which stock's pre-built feature CSV to load below.
w = widgets.Dropdown(
    options=['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
             'FB', 'GME', 'MCD', 'PFE', 'PLUG',
             'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Print the newly selected ticker when the dropdown's value changes.

    The guard is kept for backward compatibility, but observing with
    names='value' (below) already restricts the events delivered here.
    """
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

# Fix: observe only the 'value' trait. The original observed *all* trait
# changes and relied solely on the in-handler filter, so the callback was
# invoked needlessly for every widget trait update.
w.observe(on_change, names='value')

display(w)
You have selected TWTR
In [2235]:
# Load the CSV for the selected ticker. Every file follows the pattern
# '/content/Final_<TICKER>.csv', so one parameterized read replaces the
# fifteen copy-pasted if-blocks. Behavior is unchanged: while the
# placeholder 'SELECT' is chosen, nothing is loaded (df stays undefined,
# exactly as in the original).
if w.value != 'SELECT':
    df = pd.read_csv(f'/content/Final_{w.value}.csv')
In [2236]:
# Show full (untruncated) cell contents when displaying DataFrames.
pd.set_option('display.max_colwidth', None)
In [2237]:
# Parse the 'Date' strings into datetime64[ns]. pd.to_datetime is the
# idiomatic and more robust converter (handles mixed/ambiguous formats)
# compared with astype("datetime64[ns]"); the resulting dtype is identical.
df['Date'] = pd.to_datetime(df['Date'])
In [2238]:
# Drop the stray index column left behind by a previous to_csv() export.
# (Still raises KeyError if the column is absent, matching `del`.)
df = df.drop(columns=['Unnamed: 0'])
In [2239]:
# Peek at the first five rows to sanity-check the load.
df.head(5)
Out[2239]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2020-12-10 46.700001 51.740002 46.540001 51.209999 51.209999 24147200 8.426846 -0.164442 1.700452 1.921873 50.682314 45.466258 48.074286 NaN 3.752925 5.200001 68.790553 NaN NaN NaN 4.779999 NaN 0.102951 83.818273 NaN NaN 56.660995 62.186143 4.364862e+07 7.194901e+06 127226200.0 0.0 7.539123e+06 0.0 0.0 0.0 0.0 0.0 7.539123e+06 7.539123e+06 0.0 7.539123e+06 7.539123e+06 13 522 535 535 0 535 535 535
1 2020-12-11 50.619999 52.450001 49.970001 51.439999 51.439999 18020200 0.449130 -0.150618 2.855194 2.001605 52.055178 45.296250 48.675714 NaN 3.891145 2.480000 72.700360 NaN NaN NaN 4.849998 NaN 0.104100 84.209490 NaN NaN 63.550867 58.157741 4.699105e+07 9.513592e+06 145246400.0 0.0 5.647443e+06 0.0 0.0 0.0 0.0 0.0 5.647443e+06 5.647443e+06 0.0 5.647443e+06 5.647443e+06 10 404 414 414 0 414 414 414
2 2020-12-14 51.970001 53.480000 51.799999 52.020000 52.020000 17418200 1.127531 -0.075895 3.975713 2.007090 53.267838 45.292162 49.280000 NaN 3.858305 2.040001 77.476081 NaN NaN NaN 5.510002 NaN 0.118469 85.182370 NaN NaN 83.893491 68.035118 3.413478e+07 5.506645e+06 162664600.0 0.0 3.651534e+06 0.0 0.0 0.0 0.0 0.0 3.651534e+06 3.651534e+06 0.0 3.651534e+06 3.651534e+06 5 301 306 306 0 306 306 306
3 2020-12-15 52.020000 52.959999 51.610001 52.820000 52.820000 10868700 1.537869 0.054326 4.893990 1.913220 54.431616 45.582670 50.007143 NaN 3.622151 1.349998 71.036875 NaN NaN NaN 6.150002 NaN 0.131776 86.424753 NaN NaN 84.120915 77.188424 4.274924e+07 6.107800e+06 173533300.0 0.0 5.981291e+06 0.0 0.0 0.0 0.0 0.0 5.981291e+06 5.981291e+06 0.0 5.981291e+06 5.981291e+06 9 317 326 326 0 326 326 326
4 2020-12-16 54.770000 56.110001 53.639999 54.029999 54.029999 29099400 2.290797 0.141473 5.804733 2.109903 55.701460 46.064254 50.882857 2.814487 3.905058 3.290001 82.980665 NaN NaN NaN 6.799999 6.033994 0.143976 88.055958 NaN NaN 82.572617 83.529008 2.283910e+07 -5.365707e+05 202632700.0 0.0 1.289854e+07 0.0 0.0 0.0 0.0 0.0 1.289854e+07 1.289854e+07 0.0 1.289854e+07 1.289854e+07 25 628 653 653 0 653 653 653
In [2240]:
# Column dtypes and non-null counts (note the NaN warm-up rows in the
# technical indicators such as TRIX / MACD / ULTOSC).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       260 non-null    datetime64[ns]
 1   Open                       260 non-null    float64       
 2   High                       260 non-null    float64       
 3   Low                        260 non-null    float64       
 4   Close                      260 non-null    float64       
 5   Adj Close                  260 non-null    float64       
 6   Volume                     260 non-null    int64         
 7   Return                     260 non-null    float64       
 8   Beta                       260 non-null    float64       
 9   Variance                   260 non-null    float64       
 10  AvgTrueRange               260 non-null    float64       
 11  Upperband                  260 non-null    float64       
 12  Lowerband                  260 non-null    float64       
 13  Middleband                 260 non-null    float64       
 14  APO                        256 non-null    float64       
 15  NATR                       260 non-null    float64       
 16  TRANGE                     260 non-null    float64       
 17  DMI                        260 non-null    float64       
 18  MACD                       248 non-null    float64       
 19  MACDSIGNAL                 248 non-null    float64       
 20  MACDHIST                   248 non-null    float64       
 21  MOM                        260 non-null    float64       
 22  PPO                        256 non-null    float64       
 23  ROCP                       260 non-null    float64       
 24  RSI                        260 non-null    float64       
 25  TRIX                       193 non-null    float64       
 26  ULTOSC                     253 non-null    float64       
 27  SLOWK                      260 non-null    float64       
 28  SLOWD                      260 non-null    float64       
 29  AD                         260 non-null    float64       
 30  ADOSC                      260 non-null    float64       
 31  OBV                        260 non-null    float64       
 32  Upward_momentum_created    260 non-null    float64       
 33  Downward_momentum_created  260 non-null    float64       
 34  B5_O_Um                    260 non-null    float64       
 35  B5_C_Um                    260 non-null    float64       
 36  B5_E_Um                    260 non-null    float64       
 37  B5_A_Um                    260 non-null    float64       
 38  B5_N_Um                    260 non-null    float64       
 39  B5_O_Dm                    260 non-null    float64       
 40  B5_C_Dm                    260 non-null    float64       
 41  B5_E_Dm                    260 non-null    float64       
 42  B5_A_Dm                    260 non-null    float64       
 43  B5_N_Dm                    260 non-null    float64       
 44  Verified_status_True       260 non-null    int64         
 45  Verified_status_False      260 non-null    int64         
 46  O                          260 non-null    int64         
 47  C                          260 non-null    int64         
 48  E                          260 non-null    int64         
 49  A                          260 non-null    int64         
 50  N                          260 non-null    int64         
 51  Real_or_Fake_tweet         260 non-null    int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 105.8 KB
In [2241]:
# (rows, columns) of the loaded frame.
df.shape
Out[2241]:
(260, 52)
In [2242]:
# Baseline seaborn theme with slightly smaller fonts
# (overridden by the set_context call in the next cell).
sns.set(font_scale=0.8)
In [2243]:
# CHANGE CONTEXT TO poster TO INCREASE FONT SIZES
sns.set_context("talk", font_scale=1.3)

# Plot the selected stock's daily closing prices.
# (Comment fixed: this notebook plots the chosen equity, not BTC-USD.)
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18, 8))
    # Bind the plot to `ax` explicitly instead of relying on the pyplot
    # state machine, and label both axes so the figure stands alone.
    sns.lineplot(x=df.Date, y=df.Close, color='blue', ax=ax)
    ax.set_title('Closing Price')
    ax.set_xlabel('Date')
    ax.set_ylabel('Close')
In [2244]:
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change()
# Fix: the original chained .dropna() here, which was a silent no-op —
# assigning a Series to a DataFrame column re-aligns on the index, so the
# NaN first row came right back. Removing it makes the intent honest;
# the actual NaN handling happens in the dropna cell further down.
df['returns'] = 100 * df.Close.pct_change()
In [2245]:
# Log return: r_t = ln(P_t / P_{t-1}); first row is NaN by construction.
close_prices = df['Close']
df['log_returns'] = np.log(close_prices / close_prices.shift(1))
In [2246]:
df.head()
Out[2246]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2020-12-10 46.700001 51.740002 46.540001 51.209999 51.209999 24147200 8.426846 -0.164442 1.700452 1.921873 50.682314 45.466258 48.074286 NaN 3.752925 5.200001 68.790553 NaN NaN NaN 4.779999 NaN 0.102951 83.818273 NaN NaN 56.660995 62.186143 4.364862e+07 7.194901e+06 127226200.0 0.0 7.539123e+06 0.0 0.0 0.0 0.0 0.0 7.539123e+06 7.539123e+06 0.0 7.539123e+06 7.539123e+06 13 522 535 535 0 535 535 535 NaN NaN
1 2020-12-11 50.619999 52.450001 49.970001 51.439999 51.439999 18020200 0.449130 -0.150618 2.855194 2.001605 52.055178 45.296250 48.675714 NaN 3.891145 2.480000 72.700360 NaN NaN NaN 4.849998 NaN 0.104100 84.209490 NaN NaN 63.550867 58.157741 4.699105e+07 9.513592e+06 145246400.0 0.0 5.647443e+06 0.0 0.0 0.0 0.0 0.0 5.647443e+06 5.647443e+06 0.0 5.647443e+06 5.647443e+06 10 404 414 414 0 414 414 414 0.449130 0.004481
2 2020-12-14 51.970001 53.480000 51.799999 52.020000 52.020000 17418200 1.127531 -0.075895 3.975713 2.007090 53.267838 45.292162 49.280000 NaN 3.858305 2.040001 77.476081 NaN NaN NaN 5.510002 NaN 0.118469 85.182370 NaN NaN 83.893491 68.035118 3.413478e+07 5.506645e+06 162664600.0 0.0 3.651534e+06 0.0 0.0 0.0 0.0 0.0 3.651534e+06 3.651534e+06 0.0 3.651534e+06 3.651534e+06 5 301 306 306 0 306 306 306 1.127531 0.011212
3 2020-12-15 52.020000 52.959999 51.610001 52.820000 52.820000 10868700 1.537869 0.054326 4.893990 1.913220 54.431616 45.582670 50.007143 NaN 3.622151 1.349998 71.036875 NaN NaN NaN 6.150002 NaN 0.131776 86.424753 NaN NaN 84.120915 77.188424 4.274924e+07 6.107800e+06 173533300.0 0.0 5.981291e+06 0.0 0.0 0.0 0.0 0.0 5.981291e+06 5.981291e+06 0.0 5.981291e+06 5.981291e+06 9 317 326 326 0 326 326 326 1.537869 0.015262
4 2020-12-16 54.770000 56.110001 53.639999 54.029999 54.029999 29099400 2.290797 0.141473 5.804733 2.109903 55.701460 46.064254 50.882857 2.814487 3.905058 3.290001 82.980665 NaN NaN NaN 6.799999 6.033994 0.143976 88.055958 NaN NaN 82.572617 83.529008 2.283910e+07 -5.365707e+05 202632700.0 0.0 1.289854e+07 0.0 0.0 0.0 0.0 0.0 1.289854e+07 1.289854e+07 0.0 1.289854e+07 1.289854e+07 25 628 653 653 0 653 653 653 2.290797 0.022650
In [2247]:
# Drop every row containing any NaN.
# Fix: the original comment claimed only the first (shifted) row is
# dropped, but several indicator columns start with warm-up NaNs
# (TRIX, MACD, ULTOSC, APO, PPO per df.info() above), so this actually
# removes all warm-up rows — the frame goes from 260 to 193 rows.
# Also switched off `inplace=True` in favor of explicit re-binding.
df = df.dropna()
In [2248]:
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
# Fix: sns.distplot is deprecated (removed in seaborn >= 0.14). Use
# sns.histplot with a density scale and overlay the fitted normal curve
# explicitly (this reproduces distplot's norm_hist=True, fit=stats.norm).
def _plot_series_with_dist(series, color, title, line_ax, hist_ax):
    """Draw the raw series on line_ax and its density histogram with a
    fitted-normal overlay on hist_ax."""
    line_ax.plot(series, color=color)
    line_ax.set_title(title)

    sns.histplot(series, stat='density', bins=50, color=color, ax=hist_ax)
    mu, sigma = stats.norm.fit(series)
    grid = np.linspace(series.min(), series.max(), 200)
    hist_ax.plot(grid, stats.norm.pdf(grid, mu, sigma), color='black', lw=2)
    hist_ax.set_title(title)

with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18, 12))
    _plot_series_with_dist(df.returns, 'blue', 'Returns',
                           axes[0][0], axes[0][1])
    _plot_series_with_dist(df.log_returns, 'green', 'Log Returns',
                           axes[1][0], axes[1][1])
    plt.tight_layout()
    fig.show();
In [2249]:
# HELPER THAT CALCULATES REALIZED VOLATILITY FROM DAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Realized volatility over a window of daily log returns: the square
    root of the sum of squared log returns divided by (n - 1), where n
    is the number of observations in the window.
    """
    window_size = len(series_log_return)
    sum_of_squares = np.sum(np.square(series_log_return))
    return np.sqrt(sum_of_squares / (window_size - 1))
In [2250]:
# Realized volatility over several rolling window lengths (trading days).
intervals = [7, 30, 60, 180, 365]

# Build the frame in one step: one column per window size.
# Fix: the original reused the name `vols_df` first as a dict and then as
# a DataFrame (hidden-state anti-pattern), and passed `columns=intervals`
# alongside a dict input, which is redundant — dict keys already define
# the columns.
vols_df = pd.DataFrame(
    {i: df.log_returns.rolling(window=i)
                      .apply(realized_volatility_daily).values
     for i in intervals},
    index=df.index,
)
In [2251]:
# CHANGING MATPLOTLIB STYLE
plt.style.use(['fivethirtyeight'])

# Compare realized volatility across window lengths; the noisy 7-day
# series is drawn thinner and semi-transparent so the longer windows
# stand out.
fig, ax = plt.subplots(figsize=(18, 7))

for window in intervals:
    is_short_window = window == 7
    ax.plot(vols_df[window],
            label=f'{window}-Day Interval Realized Volatility',
            alpha=0.5 if is_short_window else 1.0,
            lw=1 if is_short_window else 2)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [2252]:
INTERVAL_WINDOW = 30  # rolling window length in trading days
n_future = 7          # forecast horizon in trading days

# GET BACKWARD LOOKING REALIZED VOLATILITY
# vol_current at day t = realized volatility of log returns over the
# 30 days ending at t (NaN for the first INTERVAL_WINDOW-1 rows).
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                   .apply(realized_volatility_daily)

# GET FORWARD LOOKING REALIZED VOLATILITY 
# shift(-n_future) pulls each return 7 rows earlier, so the 30-day window
# ending at row t actually covers days (t-22 .. t+7): a volatility window
# that extends n_future days into the future — the prediction target.
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
In [2253]:
# Summary statistics; note vol_current/vol_future counts (164/157) are
# below 193 — their rolling warm-up rows are still NaN at this point.
df.describe()
Out[2253]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 193.000000 193.000000 193.000000 193.000000 193.000000 1.930000e+02 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 1.930000e+02 1.930000e+02 1.930000e+02 193.0 1.930000e+02 193.0 193.0 193.0 193.0 193.0 1.930000e+02 1.930000e+02 193.0 1.930000e+02 1.930000e+02 193.000000 193.000000 193.000000 193.000000 193.0 193.000000 193.000000 193.000000 193.000000 193.000000 164.000000 157.000000
mean 60.505285 61.447046 59.427151 60.379326 60.379326 1.541841e+07 -0.179230 0.681981 3.294950 2.220005 63.875250 57.665934 60.770592 -0.934987 3.742233 2.158808 33.810114 -0.651559 -0.552816 -0.098744 -1.199430 -1.632016 -0.017407 46.011956 0.016613 46.824807 45.804519 45.872320 -3.799530e+07 -4.859257e+06 2.387095e+08 0.0 5.401782e+06 0.0 0.0 0.0 0.0 0.0 5.401782e+06 5.401782e+06 0.0 5.401782e+06 5.401782e+06 11.430052 432.606218 444.036269 444.036269 0.0 444.036269 444.036269 444.036269 -0.179230 -0.002115 0.025427 0.024843
std 7.671426 7.759552 7.704124 7.763769 7.763769 1.060032e+07 2.502432 0.308637 4.844472 0.475926 7.688488 7.419822 7.316007 2.729166 0.957097 1.064089 22.438614 1.787583 1.667597 0.630531 5.519543 4.577350 0.088922 13.537428 0.283532 11.692358 27.795003 26.185347 8.305038e+07 1.425952e+07 1.519723e+08 0.0 8.914790e+06 0.0 0.0 0.0 0.0 0.0 8.914790e+06 8.914790e+06 0.0 8.914790e+06 8.914790e+06 20.376062 387.924875 406.028316 406.028316 0.0 406.028316 406.028316 406.028316 2.502432 0.025579 0.007876 0.007533
min 42.110001 43.099998 41.009998 42.070000 42.070000 5.060100e+06 -15.163613 -0.150069 0.101441 1.452070 45.099871 40.422766 43.732857 -8.063268 2.234642 0.730000 0.084748 -4.255418 -3.607561 -1.701909 -14.999996 -13.219390 -0.230143 17.285661 -0.557911 21.302943 1.156554 4.204780 -2.808373e+08 -5.236925e+07 -2.118365e+08 0.0 1.104128e+06 0.0 0.0 0.0 0.0 0.0 1.104128e+06 1.104128e+06 0.0 1.104128e+06 1.104128e+06 0.000000 148.000000 153.000000 153.000000 0.0 153.000000 153.000000 153.000000 -15.163613 -0.164446 0.013385 0.013385
25% 54.950001 55.150002 53.369999 54.450001 54.450001 9.774200e+06 -1.381470 0.498604 0.896394 1.837097 59.038984 51.887685 56.010000 -2.944679 3.010118 1.480000 15.361975 -2.185198 -1.764581 -0.440687 -4.320000 -4.480445 -0.069706 34.848979 -0.173962 38.510745 20.230609 21.580832 -6.587495e+07 -1.186898e+07 1.806868e+08 0.0 2.191028e+06 0.0 0.0 0.0 0.0 0.0 2.191028e+06 2.191028e+06 0.0 2.191028e+06 2.191028e+06 4.000000 271.000000 277.000000 277.000000 0.0 277.000000 277.000000 277.000000 -1.381470 -0.013911 0.018908 0.018821
50% 62.130001 63.310001 61.650002 62.240002 62.240002 1.258340e+07 -0.262424 0.726328 1.686934 2.152868 65.810410 59.518032 62.884287 -0.630641 3.564226 2.009998 31.796935 -0.624289 -0.514784 -0.129950 -1.290001 -0.995844 -0.022306 46.548253 -0.037885 45.938203 43.597303 47.833869 -8.981551e+06 -3.224471e+06 2.657536e+08 0.0 3.109194e+06 0.0 0.0 0.0 0.0 0.0 3.109194e+06 3.109194e+06 0.0 3.109194e+06 3.109194e+06 6.000000 329.000000 337.000000 337.000000 0.0 337.000000 337.000000 337.000000 -0.262424 -0.002628 0.022128 0.021879
75% 66.360001 67.360001 65.220001 66.110001 66.110001 1.722320e+07 1.309694 0.908082 3.373340 2.562512 70.334040 63.068219 66.367143 0.695961 4.291195 2.629997 47.207023 0.634572 0.579543 0.403072 2.450001 1.045198 0.041000 55.728141 0.211530 53.738403 71.410997 67.818119 9.982250e+06 4.970457e+06 3.118647e+08 0.0 4.955602e+06 0.0 0.0 0.0 0.0 0.0 4.955602e+06 4.955602e+06 0.0 4.955602e+06 4.955602e+06 10.000000 451.000000 459.000000 459.000000 0.0 459.000000 459.000000 459.000000 1.309694 0.013012 0.031369 0.029719
max 72.510002 73.339996 70.730003 72.449997 72.449997 8.837880e+07 5.704781 1.316239 30.764119 3.736077 74.793371 69.340895 71.041429 4.422051 6.765804 10.039997 84.719794 2.926390 2.521570 1.095607 9.990002 6.958423 0.163235 78.566519 0.801773 75.799259 95.400172 94.614510 5.940977e+07 2.248600e+07 5.137223e+08 0.0 8.096984e+07 0.0 0.0 0.0 0.0 0.0 8.096984e+07 8.096984e+07 0.0 8.096984e+07 8.096984e+07 184.000000 4110.000000 4294.000000 4294.000000 0.0 4294.000000 4294.000000 4294.000000 5.704781 0.055480 0.039887 0.039887
In [2254]:
# Give the target column a clearer name (explicit re-binding instead of
# inplace mutation; the resulting frame is identical).
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [2255]:
# Fill the remaining NaNs — the rolling warm-up rows of vol_current /
# vol_future created after the earlier dropna — with each column's median.
# NOTE(review): median-imputing the forward-looking target vol_future may
# bias the model toward the center of the distribution; consider dropping
# those rows instead — confirm this choice.
df = df.fillna(df.median())
In [2256]:
# Confirm no missing values remain after the median fill.
df.isna().sum()
Out[2256]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [2257]:
# Re-inspect dtypes/counts after adding the return and volatility columns.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 67 to 259
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       193 non-null    datetime64[ns]
 1   Open                       193 non-null    float64       
 2   High                       193 non-null    float64       
 3   Low                        193 non-null    float64       
 4   Close                      193 non-null    float64       
 5   Adj Close                  193 non-null    float64       
 6   Volume                     193 non-null    int64         
 7   Return                     193 non-null    float64       
 8   Beta                       193 non-null    float64       
 9   Variance                   193 non-null    float64       
 10  AvgTrueRange               193 non-null    float64       
 11  Upperband                  193 non-null    float64       
 12  Lowerband                  193 non-null    float64       
 13  Middleband                 193 non-null    float64       
 14  APO                        193 non-null    float64       
 15  NATR                       193 non-null    float64       
 16  TRANGE                     193 non-null    float64       
 17  DMI                        193 non-null    float64       
 18  MACD                       193 non-null    float64       
 19  MACDSIGNAL                 193 non-null    float64       
 20  MACDHIST                   193 non-null    float64       
 21  MOM                        193 non-null    float64       
 22  PPO                        193 non-null    float64       
 23  ROCP                       193 non-null    float64       
 24  RSI                        193 non-null    float64       
 25  TRIX                       193 non-null    float64       
 26  ULTOSC                     193 non-null    float64       
 27  SLOWK                      193 non-null    float64       
 28  SLOWD                      193 non-null    float64       
 29  AD                         193 non-null    float64       
 30  ADOSC                      193 non-null    float64       
 31  OBV                        193 non-null    float64       
 32  Upward_momentum_created    193 non-null    float64       
 33  Downward_momentum_created  193 non-null    float64       
 34  B5_O_Um                    193 non-null    float64       
 35  B5_C_Um                    193 non-null    float64       
 36  B5_E_Um                    193 non-null    float64       
 37  B5_A_Um                    193 non-null    float64       
 38  B5_N_Um                    193 non-null    float64       
 39  B5_O_Dm                    193 non-null    float64       
 40  B5_C_Dm                    193 non-null    float64       
 41  B5_E_Dm                    193 non-null    float64       
 42  B5_A_Dm                    193 non-null    float64       
 43  B5_N_Dm                    193 non-null    float64       
 44  Verified_status_True       193 non-null    int64         
 45  Verified_status_False      193 non-null    int64         
 46  O                          193 non-null    int64         
 47  C                          193 non-null    int64         
 48  E                          193 non-null    int64         
 49  A                          193 non-null    int64         
 50  N                          193 non-null    int64         
 51  Fake_news                  193 non-null    int64         
 52  returns                    193 non-null    float64       
 53  log_returns                193 non-null    float64       
 54  vol_current                193 non-null    float64       
 55  vol_future                 193 non-null    float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 85.9 KB
In [2258]:
# (rows, columns) after cleaning and feature engineering.
df.shape
Out[2258]:
(193, 56)
In [2259]:
# Defensive re-check; a no-op at this point because every NaN was just
# median-filled (isna().sum() reports zeros for all columns).
df=df.dropna()
In [2260]:
# Final dtype listing for all 56 columns.
df.dtypes
Out[2260]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [2261]:
# Correlation heatmap across all numeric columns.
# Fix: removed the redundant mid-notebook re-imports of matplotlib and
# seaborn (both are already imported in the setup cell at the top), and
# bound the plot to an explicit Axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(40, 15))
sns.heatmap(df.corr(), annot=True, ax=ax)
ax.set_title('Feature Correlation Matrix');
Out[2261]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f076ec04dd0>
In [2262]:
# Histogram of every numeric column; assigning the axes array suppresses
# the repr output (same effect as the trailing semicolon).
hist_axes = df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8)
In [2263]:
# Features whose absolute correlation with 'AvgTrueRange' exceeds 0.5, strongest first.
target_corr = df.corr()['AvgTrueRange']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(strong_corr), strong_corr))
There are 5 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
NATR            0.812226
TRIX            0.532948
Variance        0.510215
TRANGE          0.500067
Name: AvgTrueRange, dtype: float64
In [2264]:
# Features whose absolute correlation with 'NATR' exceeds 0.5, strongest first.
target_corr = df.corr()['NATR']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(strong_corr), strong_corr))
There are 17 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.812226
Volume          0.590530
vol_current     0.532526
vol_future      0.512900
High           -0.502810
Open           -0.507710
MACD           -0.516285
Adj Close      -0.534759
Close          -0.534759
Lowerband      -0.543455
RSI            -0.544967
Low            -0.546965
MOM            -0.547329
ROCP           -0.552908
ADOSC          -0.574041
AD             -0.578288
Name: NATR, dtype: float64
In [2265]:
# Features whose absolute correlation with 'TRANGE' exceeds 0.5, strongest first.
target_corr = df.corr()['TRANGE']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with TRANGE:
TRANGE                       1.000000
Volume                       0.689561
B5_N_Dm                      0.643356
B5_A_Dm                      0.643356
B5_C_Dm                      0.643356
B5_O_Dm                      0.643356
Downward_momentum_created    0.643356
Verified_status_False        0.581601
Fake_news                    0.577342
N                            0.577342
A                            0.577342
C                            0.577342
O                            0.577342
AvgTrueRange                 0.500067
Name: TRANGE, dtype: float64
In [2266]:
# Features whose absolute correlation with 'O' (Openness) exceeds 0.5, strongest first.
target_corr = df.corr()['O']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999722
Verified_status_True         0.893751
B5_N_Dm                      0.891418
B5_A_Dm                      0.891418
B5_C_Dm                      0.891418
B5_O_Dm                      0.891418
Downward_momentum_created    0.891418
Volume                       0.784233
TRANGE                       0.577342
Name: O, dtype: float64
In [2267]:
# Features whose absolute correlation with 'C' (Conscientiousness) exceeds 0.5, strongest first.
target_corr = df.corr()['C']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999722
Verified_status_True         0.893751
B5_N_Dm                      0.891418
B5_A_Dm                      0.891418
B5_C_Dm                      0.891418
B5_O_Dm                      0.891418
Downward_momentum_created    0.891418
Volume                       0.784233
TRANGE                       0.577342
Name: C, dtype: float64
In [2268]:
# Features whose absolute correlation with 'E' (Extraversion) exceeds 0.5.
# BUG FIX: the printed label said "conscientiousness" — a copy-paste leftover
# from the 'C' cell; the column analysed here is 'E' (Extraversion).
df_corr = df.corr()['E'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: E, dtype: float64)
In [2269]:
# Features whose absolute correlation with 'A' (Agreeableness) exceeds 0.5.
# BUG FIX: the printed label said "conscientiousness" — a copy-paste leftover
# from the 'C' cell; the column analysed here is 'A' (Agreeableness).
df_corr = df.corr()['A'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999722
Verified_status_True         0.893751
B5_N_Dm                      0.891418
B5_A_Dm                      0.891418
B5_C_Dm                      0.891418
B5_O_Dm                      0.891418
Downward_momentum_created    0.891418
Volume                       0.784233
TRANGE                       0.577342
Name: A, dtype: float64
In [2270]:
# Features whose absolute correlation with 'N' (Neuroticism) exceeds 0.5.
# BUG FIX: the printed label said "conscientiousness" — a copy-paste leftover
# from the 'C' cell; the column analysed here is 'N' (Neuroticism).
df_corr = df.corr()['N'] 
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 14 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999722
Verified_status_True         0.893751
B5_N_Dm                      0.891418
B5_A_Dm                      0.891418
B5_C_Dm                      0.891418
B5_O_Dm                      0.891418
Downward_momentum_created    0.891418
Volume                       0.784233
TRANGE                       0.577342
Name: N, dtype: float64
In [2271]:
# Full column inventory of the current frame.
df.columns
Out[2271]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [2272]:
# Features whose absolute correlation with 'B5_O_Um' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_O_Um']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(strong_corr), strong_corr))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [2273]:
# Features whose absolute correlation with 'B5_C_Um' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_C_Um']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(strong_corr), strong_corr))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [2274]:
# Features whose absolute correlation with 'B5_E_Um' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_E_Um']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(strong_corr), strong_corr))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [2275]:
# Features whose absolute correlation with 'B5_A_Um' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_A_Um']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(strong_corr), strong_corr))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [2276]:
# Features whose absolute correlation with 'B5_N_Um' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_N_Um']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(strong_corr), strong_corr))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [2277]:
# Features whose absolute correlation with 'B5_O_Dm' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_O_Dm']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.891418
N                            0.891418
A                            0.891418
C                            0.891418
O                            0.891418
Verified_status_False        0.887518
Verified_status_True         0.866248
Volume                       0.774264
TRANGE                       0.643356
Name: B5_O_Dm, dtype: float64
In [2278]:
# Features whose absolute correlation with 'B5_C_Dm' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_C_Dm']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with B5_C_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.891418
N                            0.891418
A                            0.891418
C                            0.891418
O                            0.891418
Verified_status_False        0.887518
Verified_status_True         0.866248
Volume                       0.774264
TRANGE                       0.643356
Name: B5_C_Dm, dtype: float64
In [2279]:
# Features whose absolute correlation with 'B5_E_Dm' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_E_Dm']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(strong_corr), strong_corr))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [2280]:
# Features whose absolute correlation with 'B5_A_Dm' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_A_Dm']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with B5_A_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.891418
N                            0.891418
A                            0.891418
C                            0.891418
O                            0.891418
Verified_status_False        0.887518
Verified_status_True         0.866248
Volume                       0.774264
TRANGE                       0.643356
Name: B5_A_Dm, dtype: float64
In [2281]:
# Features whose absolute correlation with 'B5_N_Dm' exceeds 0.5, strongest first.
target_corr = df.corr()['B5_N_Dm']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.891418
N                            0.891418
A                            0.891418
C                            0.891418
O                            0.891418
Verified_status_False        0.887518
Verified_status_True         0.866248
Volume                       0.774264
TRANGE                       0.643356
Name: B5_N_Dm, dtype: float64
In [2282]:
# Features whose absolute correlation with 'Fake_news' exceeds 0.5, strongest first.
target_corr = df.corr()['Fake_news']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999722
Verified_status_True         0.893751
B5_N_Dm                      0.891418
B5_A_Dm                      0.891418
B5_C_Dm                      0.891418
B5_O_Dm                      0.891418
Downward_momentum_created    0.891418
Volume                       0.784233
TRANGE                       0.577342
Name: Fake_news, dtype: float64
In [2283]:
# Features whose absolute correlation with 'Downward_momentum_created' exceeds 0.5, strongest first.
target_corr = df.corr()['Downward_momentum_created']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Fake_news                    0.891418
N                            0.891418
A                            0.891418
C                            0.891418
O                            0.891418
Verified_status_False        0.887518
Verified_status_True         0.866248
Volume                       0.774264
TRANGE                       0.643356
Name: Downward_momentum_created, dtype: float64
In [2284]:
# Features whose absolute correlation with 'Upward_momentum_created' exceeds 0.5, strongest first.
target_corr = df.corr()['Upward_momentum_created']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(strong_corr), strong_corr))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [2285]:
# Features whose absolute correlation with 'Verified_status_True' exceeds 0.5, strongest first.
target_corr = df.corr()['Verified_status_True']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(strong_corr), strong_corr))
There are 13 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
Fake_news                    0.893751
N                            0.893751
A                            0.893751
C                            0.893751
O                            0.893751
Verified_status_False        0.882934
B5_N_Dm                      0.866248
B5_A_Dm                      0.866248
B5_C_Dm                      0.866248
B5_O_Dm                      0.866248
Downward_momentum_created    0.866248
Volume                       0.611195
Name: Verified_status_True, dtype: float64
In [2286]:
# Features whose absolute correlation with 'Verified_status_False' exceeds 0.5, strongest first.
target_corr = df.corr()['Verified_status_False']
strong_corr = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(strong_corr), strong_corr))
There are 14 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999722
N                            0.999722
A                            0.999722
C                            0.999722
O                            0.999722
B5_N_Dm                      0.887518
B5_A_Dm                      0.887518
B5_C_Dm                      0.887518
B5_O_Dm                      0.887518
Downward_momentum_created    0.887518
Verified_status_True         0.882934
Volume                       0.788728
TRANGE                       0.581601
Name: Verified_status_False, dtype: float64
In [2287]:
# Shrink seaborn fonts before the dense pairplot grid in the next cell.
sns.set(font_scale=0.8)
In [2288]:
# Plot NATR against every feature, five features per pairplot row.
cols_per_row = 5
for start in range(0, len(df.columns), cols_per_row):
    feature_slice = df.columns[start:start + cols_per_row]
    sns.pairplot(data=df, x_vars=feature_slice, y_vars=['NATR'])
In [2289]:
# Inspect per-column dtypes.
df.dtypes
Out[2289]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [2290]:
# Count missing values per column.
df.isnull().sum()
Out[2290]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [2291]:
# Replace any remaining NaNs with 0.
# Rebinding instead of inplace=True — inplace offers no performance benefit
# and hides the mutation from later readers (pandas anti-pattern).
df = df.fillna(0)
In [2292]:
# NOTE(review): this is a no-op — the previous cell already filled every NaN
# with 0, so there are no missing rows left to drop.
df.dropna(inplace=True)
In [2293]:
# Keep a small font scale for the annotated heatmap in the next cell.
sns.set(font_scale=0.8)
In [2294]:
# Correlation heatmap (excluding 'Close'), masked to show only strong values.
# NOTE(review): the mask thresholds are asymmetric (>= 0.5 vs <= -0.4) —
# confirm this is intentional rather than a typo for -0.5.
corr = df.drop('Close', axis=1).corr() 
plt.figure(figsize=(12, 10))

sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)], 
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
In [2295]:
# Summary statistics for every numeric column.
df.describe()
Out[2295]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 193.000000 193.000000 193.000000 193.000000 193.000000 1.930000e+02 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 1.930000e+02 1.930000e+02 1.930000e+02 193.0 1.930000e+02 193.0 193.0 193.0 193.0 193.0 1.930000e+02 1.930000e+02 193.0 1.930000e+02 1.930000e+02 193.000000 193.000000 193.000000 193.000000 193.0 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000 193.000000
mean 60.505285 61.447046 59.427151 60.379326 60.379326 1.541841e+07 -0.179230 0.681981 3.294950 2.220005 63.875250 57.665934 60.770592 -0.934987 3.742233 2.158808 33.810114 -0.651559 -0.552816 -0.098744 -1.199430 -1.632016 -0.017407 46.011956 0.016613 46.824807 45.804519 45.872320 -3.799530e+07 -4.859257e+06 2.387095e+08 0.0 5.401782e+06 0.0 0.0 0.0 0.0 0.0 5.401782e+06 5.401782e+06 0.0 5.401782e+06 5.401782e+06 11.430052 432.606218 444.036269 444.036269 0.0 444.036269 444.036269 444.036269 -0.179230 -0.002115 0.024931 0.024290
std 7.671426 7.759552 7.704124 7.763769 7.763769 1.060032e+07 2.502432 0.308637 4.844472 0.475926 7.688488 7.419822 7.316007 2.729166 0.957097 1.064089 22.438614 1.787583 1.667597 0.630531 5.519543 4.577350 0.088922 13.537428 0.283532 11.692358 27.795003 26.185347 8.305038e+07 1.425952e+07 1.519723e+08 0.0 8.914790e+06 0.0 0.0 0.0 0.0 0.0 8.914790e+06 8.914790e+06 0.0 8.914790e+06 8.914790e+06 20.376062 387.924875 406.028316 406.028316 0.0 406.028316 406.028316 406.028316 2.502432 0.025579 0.007352 0.006888
min 42.110001 43.099998 41.009998 42.070000 42.070000 5.060100e+06 -15.163613 -0.150069 0.101441 1.452070 45.099871 40.422766 43.732857 -8.063268 2.234642 0.730000 0.084748 -4.255418 -3.607561 -1.701909 -14.999996 -13.219390 -0.230143 17.285661 -0.557911 21.302943 1.156554 4.204780 -2.808373e+08 -5.236925e+07 -2.118365e+08 0.0 1.104128e+06 0.0 0.0 0.0 0.0 0.0 1.104128e+06 1.104128e+06 0.0 1.104128e+06 1.104128e+06 0.000000 148.000000 153.000000 153.000000 0.0 153.000000 153.000000 153.000000 -15.163613 -0.164446 0.013385 0.013385
25% 54.950001 55.150002 53.369999 54.450001 54.450001 9.774200e+06 -1.381470 0.498604 0.896394 1.837097 59.038984 51.887685 56.010000 -2.944679 3.010118 1.480000 15.361975 -2.185198 -1.764581 -0.440687 -4.320000 -4.480445 -0.069706 34.848979 -0.173962 38.510745 20.230609 21.580832 -6.587495e+07 -1.186898e+07 1.806868e+08 0.0 2.191028e+06 0.0 0.0 0.0 0.0 0.0 2.191028e+06 2.191028e+06 0.0 2.191028e+06 2.191028e+06 4.000000 271.000000 277.000000 277.000000 0.0 277.000000 277.000000 277.000000 -1.381470 -0.013911 0.019265 0.019265
50% 62.130001 63.310001 61.650002 62.240002 62.240002 1.258340e+07 -0.262424 0.726328 1.686934 2.152868 65.810410 59.518032 62.884287 -0.630641 3.564226 2.009998 31.796935 -0.624289 -0.514784 -0.129950 -1.290001 -0.995844 -0.022306 46.548253 -0.037885 45.938203 43.597303 47.833869 -8.981551e+06 -3.224471e+06 2.657536e+08 0.0 3.109194e+06 0.0 0.0 0.0 0.0 0.0 3.109194e+06 3.109194e+06 0.0 3.109194e+06 3.109194e+06 6.000000 329.000000 337.000000 337.000000 0.0 337.000000 337.000000 337.000000 -0.262424 -0.002628 0.022128 0.021879
75% 66.360001 67.360001 65.220001 66.110001 66.110001 1.722320e+07 1.309694 0.908082 3.373340 2.562512 70.334040 63.068219 66.367143 0.695961 4.291195 2.629997 47.207023 0.634572 0.579543 0.403072 2.450001 1.045198 0.041000 55.728141 0.211530 53.738403 71.410997 67.818119 9.982250e+06 4.970457e+06 3.118647e+08 0.0 4.955602e+06 0.0 0.0 0.0 0.0 0.0 4.955602e+06 4.955602e+06 0.0 4.955602e+06 4.955602e+06 10.000000 451.000000 459.000000 459.000000 0.0 459.000000 459.000000 459.000000 1.309694 0.013012 0.029632 0.028861
max 72.510002 73.339996 70.730003 72.449997 72.449997 8.837880e+07 5.704781 1.316239 30.764119 3.736077 74.793371 69.340895 71.041429 4.422051 6.765804 10.039997 84.719794 2.926390 2.521570 1.095607 9.990002 6.958423 0.163235 78.566519 0.801773 75.799259 95.400172 94.614510 5.940977e+07 2.248600e+07 5.137223e+08 0.0 8.096984e+07 0.0 0.0 0.0 0.0 0.0 8.096984e+07 8.096984e+07 0.0 8.096984e+07 8.096984e+07 184.000000 4110.000000 4294.000000 4294.000000 0.0 4294.000000 4294.000000 4294.000000 5.704781 0.055480 0.039887 0.039887
In [2296]:
# DROPPING ALL NaN VALUES
# NOTE(review): likely redundant — NaNs were already filled with 0 (and
# dropna already ran) in earlier cells; kept only as a guard.
df.dropna(inplace=True)
In [2297]:
# Compare realized current vs. future (target) volatility for the selected stock.
n_zoom = 365  # number of most-recent days shown in the zoomed lower panel
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])

# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
# NOTE(review): n_future and INTERVAL_WINDOW are defined in earlier cells not
# shown here — confirm they are set before this cell runs on a fresh kernel.
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    # Top panel: full history — current volatility (dotted gray) vs. target (blue).
    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    # Bottom panel: zoom on the last n_zoom days.
    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    
    plt.show();

Daily Volatility Distribution

In [2298]:
# Distribution of current daily volatility with a fitted normal curve.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 — consider
# sns.histplot(..., stat='density') plus an explicit stats.norm overlay.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm,
                bins=50, ax=ax)
    plt.title('Daily Volatility Distribution')
    
    plt.show();

Experiment 2: weekly granularity

In [2305]:
# Ticker picker: the cells below load data for whichever symbol is chosen here.
# NOTE: results depend on interactive widget state — rerun downstream cells
# after changing the selection.
stock_options = ['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
                 'FB', 'GME', 'MCD', 'PFE', 'PLUG',
                 'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU']
w = widgets.Dropdown(options=stock_options, value='SELECT',
                     description='Stock name:')

def on_change(change):
    # observe() also fires for non-value trait changes; react to value only.
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected TWTR
In [2306]:
# Load the pre-processed dataset for the selected ticker.
# Every branch of the original 15-way if-chain read the same path pattern
# '/content/Final_<TICKER>.csv', so one parameterized read replaces it.
# 'SELECT' (the placeholder) loads nothing, matching the old behavior.
if w.value != 'SELECT':
    df = pd.read_csv(f'/content/Final_{w.value}.csv',
                     parse_dates=['Date'], index_col=['Date'])
In [2308]:
# Inspect the available feature columns after loading.
df.columns
Out[2308]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [2309]:
# Sanity-check dataset dimensions (rows, columns).
df.shape
Out[2309]:
(260, 52)
In [2310]:
# Count missing values per column before imputation.
df.isnull().sum()
Out[2310]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           4
NATR                          0
TRANGE                        0
DMI                           0
MACD                         12
MACDSIGNAL                   12
MACDHIST                     12
MOM                           0
PPO                           4
ROCP                          0
RSI                           0
TRIX                         67
ULTOSC                        7
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [2311]:
# Impute remaining NaNs with each column's median (robust to the heavy
# tails of volume/indicator columns), drop the CSV's leftover positional
# index column, and give the target a clearer name.
# drop()/rename() reassignment replaces `del` + inplace rename: same
# result, but idempotency-friendly and chainable.
df = df.fillna(df.median())
df = df.drop(columns=['Unnamed: 0'])
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [2312]:
# Aggregate the daily frame to weekly granularity (calendar-week means).
df_weekly = df.resample('W').mean()
In [2313]:
# Weekly frame dimensions after resampling.
df_weekly.shape
Out[2313]:
(55, 51)
In [2314]:
# Full correlation matrix of the weekly features, annotated.
weekly_corr = df_weekly.corr()
plt.figure(figsize=(40, 15))
sns.heatmap(weekly_corr, annot=True)
Out[2314]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f076ee99090>
In [2315]:
# Shrink seaborn fonts for the dense grid of histograms that follows.
sns.set(font_scale=0.8)
In [2316]:
# Per-column histograms of the weekly features (one panel per column).
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [2317]:
# Weekly correlations with AvgTrueRange: keep |r| > 0.5, strongest first.
target_corr = df_weekly.corr()['AvgTrueRange']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with AvgTrueRange:\n{}".format(len(strong), strong))
There are 7 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
NATR            0.845533
TRANGE          0.775102
Variance        0.550136
Upperband       0.523391
MACDSIGNAL      0.514932
Volume          0.512590
Name: AvgTrueRange, dtype: float64
In [2318]:
# Weekly correlations with NATR: keep |r| > 0.5, strongest first.
target_corr = df_weekly.corr()['NATR']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with NATR :\n{}".format(len(strong), strong))
There are 4 strongly correlated values with NATR :
NATR            1.000000
AvgTrueRange    0.845533
Volume          0.642803
TRANGE          0.638717
Name: NATR, dtype: float64
In [2319]:
# Weekly correlations with TRANGE: keep |r| > 0.5, strongest first.
target_corr = df_weekly.corr()['TRANGE']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with TRANGE:\n{}".format(len(strong), strong))
There are 11 strongly correlated values with TRANGE:
TRANGE                       1.000000
AvgTrueRange                 0.775102
Volume                       0.689289
NATR                         0.638717
Verified_status_True         0.554269
B5_N_Dm                      0.541640
B5_A_Dm                      0.541640
B5_C_Dm                      0.541640
B5_O_Dm                      0.541640
Downward_momentum_created    0.541640
MACDSIGNAL                   0.511938
Name: TRANGE, dtype: float64
In [2320]:
# Weekly correlations with Openness (Big Five 'O'): keep |r| > 0.5.
target_corr = df_weekly.corr()['O']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Openness:\n{}".format(len(strong), strong))
There are 13 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999774
B5_N_Dm                      0.916414
B5_A_Dm                      0.916414
B5_C_Dm                      0.916414
B5_O_Dm                      0.916414
Downward_momentum_created    0.916414
Verified_status_True         0.916066
Volume                       0.863053
Name: O, dtype: float64
In [2321]:
# Weekly correlations with Conscientiousness (Big Five 'C'): keep |r| > 0.5.
target_corr = df_weekly.corr()['C']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with conscientiousness:\n{}".format(len(strong), strong))
There are 13 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999774
B5_N_Dm                      0.916414
B5_A_Dm                      0.916414
B5_C_Dm                      0.916414
B5_O_Dm                      0.916414
Downward_momentum_created    0.916414
Verified_status_True         0.916066
Volume                       0.863053
Name: C, dtype: float64
In [2322]:
# Strong (|r| > 0.5) weekly correlations with Extraversion (Big Five 'E').
# Fix: the printed label said "conscientiousness" — a copy-paste error
# from the 'C' cell; 'E' is Extraversion.
df_corr = df_weekly.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 0 strongly correlated values with conscientiousness:
Series([], Name: E, dtype: float64)
In [2323]:
# Strong (|r| > 0.5) weekly correlations with Agreeableness (Big Five 'A').
# Fix: the printed label said "conscientiousness" — a copy-paste error
# from the 'C' cell; 'A' is Agreeableness.
df_corr = df_weekly.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999774
B5_N_Dm                      0.916414
B5_A_Dm                      0.916414
B5_C_Dm                      0.916414
B5_O_Dm                      0.916414
Downward_momentum_created    0.916414
Verified_status_True         0.916066
Volume                       0.863053
Name: A, dtype: float64
In [2324]:
# Strong (|r| > 0.5) weekly correlations with Neuroticism (Big Five 'N').
# Fix: the printed label said "conscientiousness" — a copy-paste error
# from the 'C' cell; 'N' is Neuroticism.
df_corr = df_weekly.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999774
B5_N_Dm                      0.916414
B5_A_Dm                      0.916414
B5_C_Dm                      0.916414
B5_O_Dm                      0.916414
Downward_momentum_created    0.916414
Verified_status_True         0.916066
Volume                       0.863053
Name: N, dtype: float64
In [2325]:
# Weekly correlations with B5_O_Um (Openness, upward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_O_Um']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [2326]:
# Weekly correlations with B5_C_Um (Conscientiousness, upward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_C_Um']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [2327]:
# Weekly correlations with B5_E_Um (Extraversion, upward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_E_Um']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [2328]:
# Weekly correlations with B5_A_Um (Agreeableness, upward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_A_Um']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [2329]:
# Weekly correlations with B5_N_Um (Neuroticism, upward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_N_Um']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Um:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [2330]:
# Weekly correlations with B5_O_Dm (Openness, downward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_O_Dm']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_O_Dm:\n{}".format(len(strong), strong))
There are 14 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.933427
Fake_news                    0.916414
N                            0.916414
A                            0.916414
C                            0.916414
O                            0.916414
Verified_status_False        0.911222
Volume                       0.805462
TRANGE                       0.541640
Name: B5_O_Dm, dtype: float64
In [2331]:
# Weekly correlations with B5_C_Dm (Conscientiousness, downward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_C_Dm']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_C_Dm:\n{}".format(len(strong), strong))
There are 14 strongly correlated values with B5_C_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.933427
Fake_news                    0.916414
N                            0.916414
A                            0.916414
C                            0.916414
O                            0.916414
Verified_status_False        0.911222
Volume                       0.805462
TRANGE                       0.541640
Name: B5_C_Dm, dtype: float64
In [2332]:
# Weekly correlations with B5_E_Dm (Extraversion, downward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_E_Dm']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_E_Dm:\n{}".format(len(strong), strong))
There are 0 strongly correlated values with B5_E_Dm:
Series([], Name: B5_E_Dm, dtype: float64)
In [2333]:
# Weekly correlations with B5_A_Dm (Agreeableness, downward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_A_Dm']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_A_Dm:\n{}".format(len(strong), strong))
There are 14 strongly correlated values with B5_A_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.933427
Fake_news                    0.916414
N                            0.916414
A                            0.916414
C                            0.916414
O                            0.916414
Verified_status_False        0.911222
Volume                       0.805462
TRANGE                       0.541640
Name: B5_A_Dm, dtype: float64
In [2334]:
# Weekly correlations with B5_N_Dm (Neuroticism, downward momentum): |r| > 0.5.
target_corr = df_weekly.corr()['B5_N_Dm']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with B5_N_Dm:\n{}".format(len(strong), strong))
There are 14 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.933427
Fake_news                    0.916414
N                            0.916414
A                            0.916414
C                            0.916414
O                            0.916414
Verified_status_False        0.911222
Volume                       0.805462
TRANGE                       0.541640
Name: B5_N_Dm, dtype: float64
In [2335]:
# Strong (|r| > 0.5) weekly correlations with the fake-news indicator.
# Fix: the printed label still said 'Real_or_Fake_tweet' although the
# column was renamed to 'Fake_news' during cleaning.
df_corr = df_weekly.corr()['Fake_news']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Fake_news :\n{}".format(len(golden_features_list), golden_features_list))
There are 13 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
A                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999774
B5_N_Dm                      0.916414
B5_A_Dm                      0.916414
B5_C_Dm                      0.916414
B5_O_Dm                      0.916414
Downward_momentum_created    0.916414
Verified_status_True         0.916066
Volume                       0.863053
Name: Fake_news, dtype: float64
In [2336]:
# Weekly correlations with Downward_momentum_created: keep |r| > 0.5.
target_corr = df_weekly.corr()['Downward_momentum_created']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Downward_momentum_created :\n{}".format(len(strong), strong))
There are 14 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
B5_A_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.933427
Fake_news                    0.916414
N                            0.916414
A                            0.916414
C                            0.916414
O                            0.916414
Verified_status_False        0.911222
Volume                       0.805462
TRANGE                       0.541640
Name: Downward_momentum_created, dtype: float64
In [2337]:
# Weekly correlations with Upward_momentum_created: keep |r| > 0.5.
target_corr = df_weekly.corr()['Upward_momentum_created']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Upward_momentum_created :\n{}".format(len(strong), strong))
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [2338]:
# Weekly correlations with Verified_status_True: keep |r| > 0.5.
target_corr = df_weekly.corr()['Verified_status_True']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_True :\n{}".format(len(strong), strong))
There are 14 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_N_Dm                      0.933427
B5_A_Dm                      0.933427
B5_C_Dm                      0.933427
B5_O_Dm                      0.933427
Downward_momentum_created    0.933427
Fake_news                    0.916066
N                            0.916066
A                            0.916066
C                            0.916066
O                            0.916066
Verified_status_False        0.907325
Volume                       0.814479
TRANGE                       0.554269
Name: Verified_status_True, dtype: float64
In [2339]:
# Weekly correlations with Verified_status_False: keep |r| > 0.5.
target_corr = df_weekly.corr()['Verified_status_False']
strong = target_corr[target_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Verified_status_False :\n{}".format(len(strong), strong))
There are 13 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999774
N                            0.999774
A                            0.999774
C                            0.999774
O                            0.999774
B5_N_Dm                      0.911222
B5_A_Dm                      0.911222
B5_C_Dm                      0.911222
B5_O_Dm                      0.911222
Downward_momentum_created    0.911222
Verified_status_True         0.907325
Volume                       0.861591
Name: Verified_status_False, dtype: float64
In [2340]:
# Shrink seaborn fonts again for the pairplots below.
# NOTE(review): duplicates the earlier sns.set(font_scale=0.8) call —
# harmless, but one of the two could be removed.
sns.set(font_scale=0.8)
In [2341]:
# Pairplots of NATR against every weekly feature, five features per
# figure to keep each panel readable.
feature_cols = list(df_weekly.columns)
for start in range(0, len(feature_cols), 5):
    chunk = feature_cols[start:start + 5]
    sns.pairplot(data=df_weekly, x_vars=chunk, y_vars=['NATR'])
In [2342]:
# Replace any remaining NaNs in the weekly frame with 0 before modeling.
df_weekly.fillna(0, inplace = True)
In [2343]:
# NOTE(review): redundant — the previous cell filled every NaN with 0,
# so this dropna has nothing left to drop.
df_weekly.dropna(inplace=True)
In [2344]:
# Thresholded correlation heatmap with 'Close' excluded; cells outside
# the kept range render blank.
# NOTE(review): thresholds are asymmetric (>= 0.5 vs <= -0.4) — confirm
# this is intentional.
corr = df_weekly.drop('Close', axis=1).corr()
keep = (corr >= 0.5) | (corr <= -0.4)

plt.figure(figsize=(12, 10))
sns.heatmap(corr[keep],
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

Weekly volatility distribution

In [2345]:
# Distribution of the weekly NATR (volatility proxy) with a fitted
# normal density overlaid for reference.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.distplot(df_weekly.NATR, norm_hist=True, fit=stats.norm, bins=50, ax=ax)
    ax.set_title('Weekly Volatility Distribution')
    plt.show()